/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    MetaCost.java
 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.classifiers.meta;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;

import weka.classifiers.Classifier;
import weka.classifiers.CostMatrix;
import weka.classifiers.RandomizableSingleClassifierEnhancer;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;

/**
 * <!-- globalinfo-start --> This metaclassifier makes its base classifier
 * cost-sensitive using the method specified in<br/>
 * <br/>
 * Pedro Domingos: MetaCost: A general method for making classifiers
 * cost-sensitive. In: Fifth International Conference on Knowledge Discovery and
 * Data Mining, 155-164, 1999.<br/>
 * <br/>
 * This classifier should produce similar results to one created by passing the
 * base learner to Bagging, which is in turn passed to a CostSensitiveClassifier
 * operating on minimum expected cost. The difference is that MetaCost produces
 * a single cost-sensitive classifier of the base learner, giving the benefits
 * of fast classification and interpretable output (if the base learner itself
 * is interpretable). This implementation uses all bagging iterations when
 * reclassifying training data (the MetaCost paper reports a marginal
 * improvement when only those iterations containing each training instance are
 * used in reclassifying that instance).
 * <p/>
 * <!-- globalinfo-end -->
 * 
 * <!-- technical-bibtex-start --> BibTeX:
 * 
 * <pre>
 * &#64;inproceedings{Domingos1999,
 *    author = {Pedro Domingos},
 *    booktitle = {Fifth International Conference on Knowledge Discovery and Data Mining},
 *    pages = {155-164},
 *    title = {MetaCost: A general method for making classifiers cost-sensitive},
 *    year = {1999}
 * }
 * </pre>
 * <p/>
 * <!-- technical-bibtex-end -->
 * 
 * <!-- options-start --> Valid options are:
 * <p/>
 * 
 * <pre>
 * -I &lt;num&gt;
 *  Number of bagging iterations.
 *  (default 10)
 * </pre>
 * 
 * <pre>
 * -C &lt;cost file name&gt;
 *  File name of a cost matrix to use. If this is not supplied,
 *  a cost matrix will be loaded on demand. The name of the
 *  on-demand file is the relation name of the training data
 *  plus ".cost", and the path to the on-demand file is
 *  specified with the -N option.
 * </pre>
 * 
 * <pre>
 * -N &lt;directory&gt;
 *  Name of a directory to search for cost files when loading
 *  costs on demand (default current directory).
 * </pre>
 * 
 * <pre>
 * -cost-matrix &lt;matrix&gt;
 *  The cost matrix in Matlab single line format.
 * </pre>
 * 
 * <pre>
 * -P
 *  Size of each bag, as a percentage of the
 *  training set size. (default 100)
 * </pre>
 * 
 * <pre>
 * -S &lt;num&gt;
 *  Random number seed.
 *  (default 1)
 * </pre>
 * 
 * <pre>
 * -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
 * </pre>
 * 
 * <pre>
 * -W
 *  Full name of base classifier.
 *  (default: weka.classifiers.rules.ZeroR)
 * </pre>
 * 
 * <pre>
 * Options specific to classifier weka.classifiers.rules.ZeroR:
 * </pre>
 * 
 * <pre>
 * -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
 * </pre>
 * 
 * <!-- options-end -->
 * 
 * Options after -- are passed to the designated classifier.
 * <p>
 * 
 * @author Len Trigg (len@reeltwo.com)
 * @version $Revision$
 */
public class MetaCost extends RandomizableSingleClassifierEnhancer implements
  TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = 1205317833344726855L;

  /** load cost matrix on demand */
  public static final int MATRIX_ON_DEMAND = 1;
  /** use explicit matrix */
  public static final int MATRIX_SUPPLIED = 2;
  /** Specify possible sources of the cost matrix */
  public static final Tag[] TAGS_MATRIX_SOURCE = {
    new Tag(MATRIX_ON_DEMAND, "Load cost matrix on demand"),
    new Tag(MATRIX_SUPPLIED, "Use explicit cost matrix") };

  /** Indicates the current cost matrix source */
  protected int m_MatrixSource = MATRIX_ON_DEMAND;

  /**
   * The directory used when loading cost files on demand. Defaults to the
   * current working directory (the "user.dir" system property).
   */
  protected File m_OnDemandDirectory = new File(System.getProperty("user.dir"));

  /**
   * The name of the cost file the current matrix was read from via the -C
   * option, or null if the matrix did not come from -C. Only used to
   * reproduce the command line in {@link #getOptions()}.
   */
  protected String m_CostFile;

  /** The cost matrix */
  protected CostMatrix m_CostMatrix = new CostMatrix(1);

  /** The number of iterations. */
  protected int m_NumIterations = 10;

  /** The size of each bag sample, as a percentage of the training size */
  protected int m_BagSizePercent = 100;

  /**
   * Returns a string describing classifier
   * 
   * @return a description suitable for displaying in the explorer/experimenter
   *         gui
   */
  public String globalInfo() {

    return "This metaclassifier makes its base classifier cost-sensitive using the "
      + "method specified in\n\n"
      + getTechnicalInformation().toString()
      + "\n\n"
      + "This classifier should produce similar results to one created by "
      + "passing the base learner to Bagging, which is in turn passed to a "
      + "CostSensitiveClassifier operating on minimum expected cost. The difference "
      + "is that MetaCost produces a single cost-sensitive classifier of the "
      + "base learner, giving the benefits of fast classification and interpretable "
      + "output (if the base learner itself is interpretable). This implementation  "
      + "uses all bagging iterations when reclassifying training data (the MetaCost "
      + "paper reports a marginal improvement when only those iterations containing "
      + "each training instance are used in reclassifying that instance).";

  }

  /**
   * Returns an instance of a TechnicalInformation object, containing detailed
   * information about the technical background of this class, e.g., paper
   * reference or book this class is based on.
   * 
   * @return the technical information about this class
   */
  @Override
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "Pedro Domingos");
    result.setValue(Field.TITLE,
      "MetaCost: A general method for making classifiers cost-sensitive");
    result.setValue(Field.BOOKTITLE,
      "Fifth International Conference on Knowledge Discovery and Data Mining");
    result.setValue(Field.YEAR, "1999");
    result.setValue(Field.PAGES, "155-164");

    return result;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration<Option> listOptions() {

    Vector<Option> newVector = new Vector<Option>(5);

    newVector.addElement(new Option("\tNumber of bagging iterations.\n"
      + "\t(default 10)", "I", 1, "-I <num>"));
    newVector.addElement(new Option(
      "\tFile name of a cost matrix to use. If this is not supplied,\n"
        + "\ta cost matrix will be loaded on demand. The name of the\n"
        + "\ton-demand file is the relation name of the training data\n"
        + "\tplus \".cost\", and the path to the on-demand file is\n"
        + "\tspecified with the -N option.", "C", 1, "-C <cost file name>"));
    newVector.addElement(new Option(
      "\tName of a directory to search for cost files when loading\n"
        + "\tcosts on demand (default current directory).", "N", 1,
      "-N <directory>"));
    newVector.addElement(new Option(
      "\tThe cost matrix in Matlab single line format.", "cost-matrix", 1,
      "-cost-matrix <matrix>"));
    newVector.addElement(new Option(
      "\tSize of each bag, as a percentage of the\n"
        + "\ttraining set size. (default 100)", "P", 1, "-P"));

    // append the options of the superclass (seed, debug, base classifier, ...)
    newVector.addAll(Collections.list(super.listOptions()));

    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   * <p/>
   * 
   * <!-- options-start --> Valid options are:
   * <p/>
   * 
   * <pre>
   * -I &lt;num&gt;
   *  Number of bagging iterations.
   *  (default 10)
   * </pre>
   * 
   * <pre>
   * -C &lt;cost file name&gt;
   *  File name of a cost matrix to use. If this is not supplied,
   *  a cost matrix will be loaded on demand. The name of the
   *  on-demand file is the relation name of the training data
   *  plus ".cost", and the path to the on-demand file is
   *  specified with the -N option.
   * </pre>
   * 
   * <pre>
   * -N &lt;directory&gt;
   *  Name of a directory to search for cost files when loading
   *  costs on demand (default current directory).
   * </pre>
   * 
   * <pre>
   * -cost-matrix &lt;matrix&gt;
   *  The cost matrix in Matlab single line format.
   * </pre>
   * 
   * <pre>
   * -P
   *  Size of each bag, as a percentage of the
   *  training set size. (default 100)
   * </pre>
   * 
   * <pre>
   * -S &lt;num&gt;
   *  Random number seed.
   *  (default 1)
   * </pre>
   * 
   * <pre>
   * -D
   *  If set, classifier is run in debug mode and
   *  may output additional info to the console
   * </pre>
   * 
   * <pre>
   * -W
   *  Full name of base classifier.
   *  (default: weka.classifiers.rules.ZeroR)
   * </pre>
   * 
   * <pre>
   * Options specific to classifier weka.classifiers.rules.ZeroR:
   * </pre>
   * 
   * <pre>
   * -D
   *  If set, classifier is run in debug mode and
   *  may output additional info to the console
   * </pre>
   * 
   * <!-- options-end -->
   * 
   * Options after -- are passed to the designated classifier.
   * <p>
   * 
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported, a numeric option cannot
   *           be parsed, or the cost matrix cannot be read
   */
  @Override
  public void setOptions(String[] options) throws Exception {

    String bagIterations = Utils.getOption('I', options);
    if (bagIterations.length() != 0) {
      setNumIterations(Integer.parseInt(bagIterations));
    } else {
      setNumIterations(10);
    }

    String bagSize = Utils.getOption('P', options);
    if (bagSize.length() != 0) {
      setBagSizePercent(Integer.parseInt(bagSize));
    } else {
      setBagSizePercent(100);
    }

    String costFile = Utils.getOption('C', options);
    if (costFile.length() != 0) {
      // setCostMatrix() forgets any previously remembered file name, so the
      // name has to be recorded after the matrix has been installed
      setCostMatrix(loadCostMatrix(new File(costFile)));
      setCostMatrixSource(new SelectedTag(MATRIX_SUPPLIED, TAGS_MATRIX_SOURCE));
      m_CostFile = costFile;
    } else {
      // no -C given: drop a file name left over from an earlier call so that
      // getOptions() cannot report a file that is no longer in use
      m_CostFile = null;
      setCostMatrixSource(new SelectedTag(MATRIX_ON_DEMAND, TAGS_MATRIX_SOURCE));
    }

    String demandDir = Utils.getOption('N', options);
    if (demandDir.length() != 0) {
      setOnDemandDirectory(new File(demandDir));
    }

    String cost_matrix = Utils.getOption("cost-matrix", options);
    if (cost_matrix.length() != 0) {
      // round-trip through the textual cost matrix format, as the parsed
      // Matlab matrix is written out and read back in
      StringWriter writer = new StringWriter();
      CostMatrix.parseMatlab(cost_matrix).write(writer);
      setCostMatrix(new CostMatrix(new StringReader(writer.toString())));
      setCostMatrixSource(new SelectedTag(MATRIX_SUPPLIED, TAGS_MATRIX_SOURCE));
    }

    super.setOptions(options);

    Utils.checkForRemainingOptions(options);
  }

  /**
   * Gets the current settings of the Classifier.
   * 
   * @return an array of strings suitable for passing to setOptions
   */
  @Override
  public String[] getOptions() {

    Vector<String> options = new Vector<String>();

    if (m_MatrixSource == MATRIX_SUPPLIED) {
      if (m_CostFile != null) {
        options.add("-C");
        options.add(m_CostFile);
      } else {
        options.add("-cost-matrix");
        options.add(getCostMatrix().toMatlab());
      }
    } else {
      options.add("-N");
      options.add("" + getOnDemandDirectory());
    }
    options.add("-I");
    options.add("" + getNumIterations());
    options.add("-P");
    options.add("" + getBagSizePercent());

    Collections.addAll(options, super.getOptions());

    return options.toArray(new String[0]);
  }

  /**
   * Returns the tip text for this property
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String costMatrixSourceTipText() {
    return "Gets the source location method of the cost matrix. Will "
      + "be one of MATRIX_ON_DEMAND or MATRIX_SUPPLIED.";
  }

  /**
   * Gets the source location method of the cost matrix. Will be one of
   * MATRIX_ON_DEMAND or MATRIX_SUPPLIED.
   * 
   * @return the cost matrix source.
   */
  public SelectedTag getCostMatrixSource() {

    return new SelectedTag(m_MatrixSource, TAGS_MATRIX_SOURCE);
  }

  /**
   * Sets the source location of the cost matrix. Tags that do not belong to
   * TAGS_MATRIX_SOURCE are ignored.
   * 
   * @param newMethod the cost matrix location method.
   */
  public void setCostMatrixSource(SelectedTag newMethod) {

    if (newMethod.getTags() == TAGS_MATRIX_SOURCE) {
      m_MatrixSource = newMethod.getSelectedTag().getID();
    }
  }

  /**
   * Returns the tip text for this property
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String onDemandDirectoryTipText() {
    return "Name of directory to search for cost files when loading "
      + "costs on demand.";
  }

  /**
   * Returns the directory that will be searched for cost files when loading on
   * demand.
   * 
   * @return The cost file search directory.
   */
  public File getOnDemandDirectory() {

    return m_OnDemandDirectory;
  }

  /**
   * Sets the directory that will be searched for cost files when loading on
   * demand, and switches the cost matrix source to MATRIX_ON_DEMAND. If the
   * argument is not an existing directory its parent is used instead; if
   * there is no parent either (or the argument is null) the current working
   * directory is used.
   * 
   * @param newDir The cost file search directory.
   */
  public void setOnDemandDirectory(File newDir) {

    File dir = null;
    if (newDir != null) {
      // getParentFile() yields null for a path without a parent component,
      // whereas new File((String) null) would throw a NullPointerException
      dir = newDir.isDirectory() ? newDir : newDir.getParentFile();
    }
    if (dir == null) {
      dir = new File(System.getProperty("user.dir"));
    }
    m_OnDemandDirectory = dir;
    m_MatrixSource = MATRIX_ON_DEMAND;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String bagSizePercentTipText() {
    return "The size of each bag, as a percentage of the training set "
      + "size.";
  }

  /**
   * Gets the size of each bag, as a percentage of the training set size.
   * 
   * @return the bag size, as a percentage.
   */
  public int getBagSizePercent() {

    return m_BagSizePercent;
  }

  /**
   * Sets the size of each bag, as a percentage of the training set size.
   * 
   * @param newBagSizePercent the bag size, as a percentage.
   */
  public void setBagSizePercent(int newBagSizePercent) {

    m_BagSizePercent = newBagSizePercent;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String numIterationsTipText() {
    return "The number of bagging iterations.";
  }

  /**
   * Sets the number of bagging iterations
   * 
   * @param numIterations the number of iterations to use
   */
  public void setNumIterations(int numIterations) {

    m_NumIterations = numIterations;
  }

  /**
   * Gets the number of bagging iterations
   * 
   * @return the maximum number of bagging iterations
   */
  public int getNumIterations() {

    return m_NumIterations;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String costMatrixTipText() {
    return "A misclassification cost matrix.";
  }

  /**
   * Gets the misclassification cost matrix.
   * 
   * @return the cost matrix
   */
  public CostMatrix getCostMatrix() {

    return m_CostMatrix;
  }

  /**
   * Sets the misclassification cost matrix and switches the cost matrix
   * source to MATRIX_SUPPLIED. Any cost file name remembered from a previous
   * -C option is discarded, since it no longer describes the matrix in use.
   * 
   * @param newCostMatrix the cost matrix
   */
  public void setCostMatrix(CostMatrix newCostMatrix) {

    m_CostMatrix = newCostMatrix;
    m_CostFile = null;
    m_MatrixSource = MATRIX_SUPPLIED;
  }

  /**
   * Reads a cost matrix from the given file, closing the underlying reader
   * whether or not parsing succeeds.
   * 
   * @param file the cost file to read
   * @return the cost matrix stored in the file
   * @throws Exception if the file cannot be opened or parsed
   */
  private static CostMatrix loadCostMatrix(File file) throws Exception {

    BufferedReader reader = new BufferedReader(new FileReader(file));
    try {
      return new CostMatrix(reader);
    } finally {
      reader.close();
    }
  }

  /**
   * Returns default capabilities of the classifier.
   * 
   * @return the capabilities of this classifier
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();

    // class
    result.disableAllClasses();
    result.disableAllClassDependencies();
    result.enable(Capability.NOMINAL_CLASS);

    return result;
  }

  /**
   * Builds the model of the base learner: a bagged ensemble of the base
   * classifier is trained first, every training instance is relabelled with
   * the class of minimum expected cost under the ensemble's prediction, and
   * the base classifier is finally trained on the relabelled data.
   * 
   * @param data the training data
   * @throws Exception if the classifier could not be built successfully, or
   *           if the on-demand cost file does not exist or cannot be read
   */
  @Override
  public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    if (m_MatrixSource == MATRIX_ON_DEMAND) {
      String costName = data.relationName() + CostMatrix.FILE_EXTENSION;
      File costFile = new File(getOnDemandDirectory(), costName);
      if (!costFile.exists()) {
        throw new Exception("On-demand cost file doesn't exist: " + costFile);
      }
      // Assign the field directly instead of going through setCostMatrix():
      // the setter would flip the source to MATRIX_SUPPLIED, so a later
      // rebuild on a different relation would silently reuse this matrix.
      m_CostMatrix = loadCostMatrix(costFile);
      m_CostFile = null;
    }

    // Set up the bagger
    Bagging bagger = new Bagging();
    bagger.setClassifier(getClassifier());
    bagger.setSeed(getSeed());
    bagger.setNumIterations(getNumIterations());
    bagger.setBagSizePercent(getBagSizePercent());
    bagger.buildClassifier(data);

    // Use the bagger to reassign class values according to minimum expected
    // cost
    Instances newData = new Instances(data);
    for (int i = 0; i < newData.numInstances(); i++) {
      Instance current = newData.instance(i);
      double[] pred = bagger.distributionForInstance(current);
      int minCostPred = Utils.minIndex(m_CostMatrix.expectedCosts(pred));
      current.setClassValue(minCostPred);
    }

    // Build a classifier using the reassigned data
    m_Classifier.buildClassifier(newData);
  }

  /**
   * Returns the class distribution predicted for the given instance by the
   * base classifier that was trained on the relabelled data.
   * 
   * @param instance the instance to be classified
   * @return the class distribution for the given instance
   * @throws Exception if instance could not be classified successfully
   */
  @Override
  public double[] distributionForInstance(Instance instance) throws Exception {
    return m_Classifier.distributionForInstance(instance);
  }

  /**
   * Gets the classifier specification string, which contains the class name of
   * the classifier and any options to the classifier. For a classifier that
   * does not implement OptionHandler only the class name is returned.
   * 
   * @return the classifier string.
   */
  @Override
  protected String getClassifierSpec() {

    Classifier c = getClassifier();
    String name = c.getClass().getName();
    if (c instanceof OptionHandler) {
      return name + " " + Utils.joinOptions(((OptionHandler) c).getOptions());
    }
    return name;
  }

  /**
   * Output a representation of this classifier
   * 
   * @return a string representation of the classifier
   */
  @Override
  public String toString() {

    if (m_Classifier == null) {
      return "MetaCost: No model built yet.";
    }

    String result = "MetaCost cost sensitive classifier induction";
    result += "\nOptions: " + Utils.joinOptions(getOptions());
    result += "\nBase learner: " + getClassifierSpec()
      + "\n\nClassifier Model\n" + m_Classifier.toString()
      + "\n\nCost Matrix\n" + m_CostMatrix.toString();

    return result;
  }

  /**
   * Returns the revision string.
   * 
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision$");
  }

  /**
   * Main method for testing this class.
   * 
   * @param argv should contain the following arguments: -t training file [-T
   *          test file] [-c class index]
   */
  public static void main(String[] argv) {
    runClassifier(new MetaCost(), argv);
  }
}
